**THIS FILE CONSTRUCTS THE ZIP CODE LEVEL INCOME AND POPULATION DATA**

/*Raw Data Sources:
-SOI Zip Code Level Data, Downloaded 9-24-14

last update to this do-file: 9-25-2014
*/

**Move to the project data directory stored in the global macro datafolder.
**The path is quoted so that a directory name containing spaces reaches cd as one argument.
cd "$datafolder"

**Load the SOI zip code extract, replacing whatever data are in memory
use "zpallagi_short_09242014", clear

**Convert zipcode to a numeric variable; later steps compare it with numbers (zipcode==0)
**and use it as a merge key
destring zipcode, replace

**Count the returns (n1) that fall in the AGI brackets behind each share variable.
**inlist() is 1 for a listed stub and 0 otherwise (including a missing stub), so each
**variable holds n1 for flagged brackets and 0 for the rest.
**NOTE(review): the stub codes are reproduced unchanged from the original file. Stub 8 is
**in neither list and stub 10 counts toward frac_50k but not frac_100k -- confirm against
**the bracket coding of the input extract for every year.
gen double frac_50k=n1*inlist(agi_stub, 4, 5, 6, 7, 9, 10)
gen double frac_100k=n1*inlist(agi_stub, 6, 7, 9)

**Sum the bracket-level rows to one row per zip code and year
collapse (sum) a00100 a00200 a00300 n1 n2 frac_50k frac_100k, by(zipcode year)

rename a00100 mean_agi
rename a00200 mean_wageinc
rename a00300 mean_intinc
rename n1 numberhh
rename n2 numberpop

**Turn the flagged return counts into shares of all returns in the zip code-year
replace frac_50k=frac_50k/numberhh
replace frac_100k=frac_100k/numberhh

**frac_100k is not defined for 2001 and earlier. The missing mask has to be applied here,
**after the collapse: collapse (sum) returns 0 for a group whose values are all missing,
**so masking before the collapse would leave these years as a share of 0 instead of missing.
**Anything computed later from these years inherits the missing value.
replace frac_100k=. if year<=2001


**The AGI and income fields are bracket totals recorded in thousands. After summing over
**brackets they are converted to dollars and divided by numberhh (the collapsed n1 count)
**to give a mean per return. Note the divisor is numberhh, not numberpop.
foreach inc of varlist mean_agi mean_wageinc mean_intinc {
	replace `inc'=(`inc'*1000)/numberhh
	* 2008 values are scaled back down by 1000
	* NOTE(review): presumably the 2008 file already reports dollars rather than thousands -- confirm
	replace `inc'=`inc'/1000 if year==2008
}



**Attach the annual CPI-U value to every zip code-year row.
**keep(master match) discards CPI years that have no income data, and nogenerate
**suppresses the _merge indicator, so no cleanup statements are needed afterwards.
merge m:1 year using "cpi.dta", keep(master match) nogenerate

**Express the income means in 2013 dollars: multiply by (base index / index of the data year)
local cpi_base 232.957
foreach inc of varlist mean_agi mean_wageinc mean_intinc {
	replace `inc'=`inc'*(`cpi_base'/cpi)
}

**The index is only needed for the deflation step
drop cpi

**Discard rows coded with zip code 0, then order the panel by zip code and year
drop if zipcode==0
sort zipcode year

**Bring in the zip code level file and keep only zip codes present in both datasets;
**nogenerate means no _merge variable is left behind
merge m:1 zipcode using "zipcodedata2000_2.dta", keep(match) nogenerate


**Fill the years that are absent from the input by linear interpolation between the two
**nearest available years. The data are reshaped to one row per zip code so that each
**year is its own column, the new year columns are generated, and the panel is reshaped back.
local panelvars mean_agi mean_wageinc mean_intinc numberhh numberpop frac_50k frac_100k

reshape wide `panelvars', i(zipcode) j(year)

**Each pair lists the years bracketing a two-year gap: 1999-2000, 2002-2003 and 2009-2010.
**The year closer to an endpoint gets weight 2/3 on that endpoint and 1/3 on the other.
**The new columns are created as double: the existing columns come out of collapse as
**double, and a plain gen would store the interpolated values as float and lose precision.
foreach v of local panelvars {
	foreach gap in "1998 2001" "2001 2004" "2008 2011" {
		local lo : word 1 of `gap'
		local hi : word 2 of `gap'
		local y1=`lo'+1
		local y2=`lo'+2
		gen double `v'`y1'=`v'`lo'*(2/3)+`v'`hi'*(1/3)
		gen double `v'`y2'=`v'`lo'*(1/3)+`v'`hi'*(2/3)
	}
}

reshape long `panelvars', i(zipcode) j(year)

**Build time-varying zip code populations: take the level from the zip code file
**(totalpop, totalhousing) and scale it by the ratio of each year's SOI count to the
**SOI count in 2000.
**For each of the two SOI counts (numberhh, numberpop):
**  <unit>2000  holds the count in the year-2000 row and 0 in the other rows
**  <unit>base  spreads that year-2000 count to every row of the zip code
**  gr_<unit>   is the count relative to its year-2000 value
foreach unit in hh pop {
	gen `unit'2000=number`unit'*(year==2000)
	bysort zipcode: egen `unit'base=total(`unit'2000)
	gen gr_`unit'=number`unit'/`unit'base
}

**Apply the ratios to the base levels
replace totalpop=totalpop*gr_pop
replace totalhousing=totalhousing*gr_hh

**Keep only the identifiers and the finished measures; every intermediate variable is dropped
local finalvars zipcode state year mean_agi mean_wageinc mean_intinc totalpop totalhousing frac_50k frac_100k
keep `finalvars'

**Variable labels (frac_50k and frac_100k are left unlabeled, as before)
label variable totalpop "Total population, 2000 census base, inflated using SOI population"
label variable totalhousing "Total households, 2000 census base, inflated using SOI population"
label variable mean_agi "Mean AGI"
label variable mean_wageinc "Mean Wage and Salary Income"
label variable mean_intinc "Mean Taxable Interest Income"

**Write the zip code-year panel to the data folder, overwriting any earlier version
save "zipcode_income_pop_data.dta", replace




